/*********************************************************
This imports and mergesdata from Callfire from the May 2022 
roll out

Last updated: May 15th, 2025
*********************************************************/

//Goal to match 2 datasets on unique phone numbers 10 digit 
//1. Callfire calls 1 per phone number (need to clean a little extra)
clear all
	cd "$data/2_field_experiment/may data"
	use "ImportCallfireMay22_ONEcall.dta"
		drop if fromnumber=="D0000251251" 
		unique fromnumber // still some duplicates, keep 1st one in timeline
		sort fromnumber numtime
		bysort fromnumber: gen drop2=_n
		drop if drop2==2
		drop drop2
		//Any of following can be what we match on
		unique fromnumber
		unique num_fromnumber
		unique ContactNameFormat
		sort num_fromnumber
	save CallfireMay22_Unique_Phonenum.dta, replace
		
//2. Full dataset
clear all
import delimited "$data/2_field_experiment/principals data/firstphaseranmerged.csv"
	//Drop those with duplicates emails sent to them
	gen fromnumber = phone
	gen num_fromnumber = fromnumber 	
	bysort emailaddress: gen dup_emails=_N	
	drop if dup_emails>1	
	//Look for multiple phonenumbers
	bysort fromnumber: gen dup_phone=_N	
	drop if dup_phone>1
	//tabstat treatmentnumber, by(treatment) format(%9.2fc) stat(N) labelwidth(32)
	sort fromnumber //7,935 emails went out to a unique phone AND unique email
	gen ContactName7=substr(fromnumber,1 , 7)
	egen num_ContactName7 = count(ContactName7), by(ContactName7) // 4,994 occur a single time, rest have duplicates
	save "$data/2_field_experiment/principals data/firstphaseranmerged_noDups.dta", replace
//3. Now merge the 2 datasets	
	cd "$data/2_field_experiment/may data"
	*tostring num_fromnumber, replace force format(%20.0f)

	merge 1:1 fromnumber using CallfireMay22_Unique_Phonenum.dta // matched 854
	savesome if _merge==2 using "$data/2_field_experiment/may data/May2022_UnmatchedFrom10.dta", replace
	savesome if _merge==3 using "$data/2_field_experiment/may data/00_May2022_MatchedFrom10.dta", replace 
	drop if _merge==2
	gen temp_merge=_merge
	drop _merge
	savesome if temp_merge==1 using "$data/2_field_experiment/may data/firstphaseranmerged_noDups_UnmatchedFrom10.dta", replace	//7081 schools still yet to be matched
	save "$data/2_field_experiment/may data/May2022_MatchCallfire2Principals_10digit.dta", replace 

//---------Take the 800 calls that were unmatched
clear all
cd "$data/2_field_experiment/may data"
use May2022_UnmatchedFrom10.dta
drop treatmentnumber - dfname // drop the empty variables
drop _merge 
drop ContactName7 
gen ContactName7=substr(fromnumber,1 , 7)
	egen num_ContactName7_C = count(ContactName7), by(ContactName7) // 662 occur a single time, rest have duplicates
unique CallfireID // 800 unique CallfireIDs, endgoal is to ideally link each of these 800 to a single principal email sent	
sort ContactName7
	merge m:m ContactName7 using firstphaseranmerged_noDups_UnmatchedFrom10.dta // matched 592
	drop if _merge==2 // drop the Principals data that isn't matched
unique CallfireID // 800 unique CallfireIDs but now 981 records
unique CallfireID if _merge==3 // 411 unique CallfireIDs with 592 records	
	count if num_ContactName7==1&num_ContactName7_C==1&_merge==3 
	
	gen CheckMatchLater=0
	gen MaleHigh=0
		gen MaleHigh_P=0
			replace MaleHigh_P=1 if strpos(treatment, "Roy high")>0
		gen MaleHigh_C=(Treatment==1)
		count if MaleHigh_P==0&MaleHigh_C==1
		replace CheckMatchLater=1 if MaleHigh_P==0&MaleHigh_C==1
		replace MaleHigh=1 if strpos(treatment, "Roy high")
	gen MaleLow=0
		gen MaleLow_P=0 
			replace MaleLow_P=1 if strpos(treatment, "Roy low")
		gen MaleLow_C=(Treatment==2)
		replace CheckMatchLater=1 if MaleLow_P==0&MaleLow_C==1
		replace MaleLow=1 if strpos(treatment, "Roy low")	
	gen FemaleHigh=0
		gen FemaleHigh_P=0
			replace FemaleHigh_P=1 if strpos(treatment, "Erica high")
		gen FemaleHigh_C=(Treatment==3)
		replace CheckMatchLater=1 if FemaleHigh_P==0&FemaleHigh_C==1
		replace FemaleHigh=1 if strpos(treatment, "Erica high")	
	gen FemaleLow=0
		gen FemaleLow_P=0
		replace FemaleLow_P=1 if strpos(treatment, "Erica low")
		gen FemaleLow_C=(Treatment==4)
		replace CheckMatchLater=1 if FemaleLow_P==0&FemaleLow_C==1
		replace FemaleLow=1 if strpos(treatment, "Erica low")		
	
	sort _merge CallfireID ContactName7 num_ContactName7 num_ContactName7_C CheckMatchLater
	/*brow CallfireID ///
		ContactName7 num_ContactName7 num_ContactName7_C ///
		fromnumber link ///
		Variation Treatment treatment CheckMatchLater ///
		schoolname firstname lastname locationcity locationstate 
	*/
*export excel CallfireID /// DH change to not run
		ContactName7 num_ContactName7 num_ContactName7_C ///
		fromnumber link ///
		Variation Treatment treatment CheckMatchLater ///
		emailaddress schoolname locationcity locationstate  ///
		num_fromnumber dup_emails dup_phone tonumber state result answertime duration Wave num_phone FemaleNum FemalePrincipal MomListedFirst ContactName6 ContactNameFormat6 ContactNameFormat date numtime phoneID PhoneNumPair DaysSinceStartingEmails NumCalls CallOrder ContactName7 num_ContactName7_C treatmentnumber treatment treatmentname genderofprincipal signature signaturenumber fromperson cc message1 message2 v1 x v3 v2 v5 personnelid titlesid ncesschoolid schoolname districtid districtname schoolid name firstname lastname title emailaddress phone fax locationaddress locationcity locationstate locationzip_code_anon lowgrade highgrade gradestatus whetheracharterschool totalstudents nurseryprekindergartenoffered kindergartenoffered firstgradeoffered secondgradeoffered thirdgradeoffered fourthgradeoffered fifthgradeoffered sixthgradeoffered seventhgradeoffered eighthgradeoffered ninthgradeoffered tenthgradeoffered eleventhgradeoffered twelfthgradeoffered teacherstotal numberofschools expendituresperstudent virtualschoolstatus localecode metropolitanormicropolitanstatis totaloffreeluncheligibleandreduc totalstudentseligibletoparticipa v53 allstudentsamericanindianalaskan allstudentsasian allstudentshispanic allstudentsblack allstudentswhite allstudentshawaiiannativepacific classroomteachers titleistatus nationalschoollunchprogramstatus titleistatusprogram titleieligibleschool schoolwidetitlei permanentidentificationnumber principalname address statefull locationzip4 schooltypology religion schoolorientation diocese schooltype gradelevel totalnumberofstudents sizeofschoolk12ug numberofk12teachers urbancentriccommunitytype lengthofschooldayinhours v83 proportion_male proportion_female gender year_min year_max ra_hand_code v90 v91 prob_asian prob_black prob_hispanic prob_white race region region_name public dfname num_ContactName7 temp_merge _merge CheckMatchLater MaleHigh MaleHigh_P MaleHigh_C MaleLow MaleLow_P FemaleHigh FemaleHigh_C FemaleLow FemaleLow_P FemaleLow_C ///
		using ForRAsToMatch_Step2, sheetreplace firstrow(variables)
		clear all
		
*****************************************************************************************
* In the section above, the data was exported and then incorrect merges were manually
* flagged by listening to the voicemail data from each observation's phone number.
*****************************************************************************************

clear all
import excel "$data/2_field_experiment/may data/ForRAsToMatch_Step2 (May 2022).xlsx", sheet("Callfire Matches") firstrow
		//only run code below on sheet after RAS are finished
		drop if CorrectMatch!=1
		drop if MatchedEmail=="*" 
		drop if MatchedEmail=="**" 
		drop if MatchedEmail=="" 
		drop if MatchedEmail=="NA" 
		//only run code above on sheet after RAs are finished
	bysort CallfireID: gen check=_N // should be 1 for all
	drop check	
	//This dataset now contains 492 lines of CallFire matched to a single School
	gen FemaleNum_num=(FemaleNum=="Female Number (Mom)")
	drop FemaleNum
	rename FemaleNum_num FemaleNum
	drop emailaddress
	rename MatchedEmail emailaddress
	//keep emailaddress FemaleNum CallfireID
	sort emailaddress CallfireID
	bysort emailaddress: gen check=_n	
	drop if check>1 //keeps the 1st call made by the emailaddress
	drop check
	append using "$data/2_field_experiment/may data/00_May2022_MatchedFrom10.dta", force
	rename fromnumber fromnumber_2
	//rename answertime answertime_2
	keep emailaddress FemaleNum phoneID CallfireID tonumber answertime numtime link fromnumber_2 // 1330 unique email addresses matched
	merge 1:1 emailaddress using "$data/2_field_experiment/principals data/firstphaseranmerged_noDups.dta"
	
	//if RAs have only done 6 digit match run below
*export excel ContactName7 ///
		num_ContactName7  ///
		fromnumber ///
		treatment ///
		emailaddress schoolname firstname lastname locationcity locationstate  ///
		num_fromnumber dup_emails dup_phone state treatmentnumber treatment treatmentname genderofprincipal signature signaturenumber fromperson cc message1 message2 v1 x v3 v2 v5 personnelid titlesid ncesschoolid schoolname districtid districtname schoolid name firstname lastname title emailaddress phone fax locationaddress locationcity locationstate locationzip locationzipplus4 lowgrade highgrade gradestatus whetheracharterschool totalstudents nurseryprekindergartenoffered kindergartenoffered firstgradeoffered secondgradeoffered thirdgradeoffered fourthgradeoffered fifthgradeoffered sixthgradeoffered seventhgradeoffered eighthgradeoffered ninthgradeoffered tenthgradeoffered eleventhgradeoffered twelfthgradeoffered teacherstotal numberofschools expendituresperstudent virtualschoolstatus localecode metropolitanormicropolitanstatis totaloffreeluncheligibleandreduc totalstudentseligibletoparticipa v53 allstudentsamericanindianalaskan allstudentsasian allstudentshispanic allstudentsblack allstudentswhite allstudentshawaiiannativepacific classroomteachers titleistatus nationalschoollunchprogramstatus titleistatusprogram titleieligibleschool schoolwidetitlei permanentidentificationnumber principalname address statefull locationzip4 schooltypology religion schoolorientation diocese schooltype gradelevel totalnumberofstudents sizeofschoolk12ug numberofk12teachers urbancentriccommunitytype lengthofschooldayinhours v83 proportion_male proportion_female gender year_min year_max ra_hand_code v90 v91 prob_asian prob_black prob_hispanic prob_white race region region_name public dfname num_ContactName7 ///
		using ForRAsToMatch_Step3 phoneID if _merge!=3, sheetreplace firstrow(variables)
	//if RAs have only done 6 digit match run above
	
	gen Variation=.
	replace Variation=0 if strpos(treatment, "Main")&Variation==.
	replace Variation=1 if strpos(treatment, "Payment")&Variation==.
	replace Variation=2 if strpos(treatment, "Constant")&Variation==.
	replace Variation=3 if strpos(treatment, "FT")&Variation==.
	gen Treatment=.
	replace Treatment=1 if strpos(treatment, "Roy high")&Treatment==.
	replace Treatment=2 if strpos(treatment, "Roy low")&Treatment==.
	replace Treatment=3 if strpos(treatment, "Erica high")&Treatment==.
	replace Treatment=4 if strpos(treatment, "Erica low")&Treatment==.	
	replace Treatment=0 if Treatment==.
	
	gen WhoCalled=.
	replace WhoCalled=0 if FemaleNum==.
	replace WhoCalled=1 if FemaleNum==1
	replace WhoCalled=2 if FemaleNum==0
	label define WhoCalledl ///
		0 "No Call" ///
		1 "Female Call" ///
		2 "Male Call"	
	label values WhoCalled WhoCalledl	

gen NoCall=0
	replace NoCall=1 if FemaleNum==.
gen FemaleNum0=FemaleNum
	replace FemaleNum0=0 if FemaleNum==.
gen MaleNum=1-FemaleNum	
gen MaleNum0=MaleNum
	replace MaleNum0=0 if MaleNum==.
	
gen order=.
replace order=-2 if Treatment==1
replace order=-1 if Treatment==4
replace order=0 if Treatment==0
replace order=1 if Treatment==2
replace order=2 if Treatment==3	

label values Variation Variationl
label values Treatment Treatl

	
	egen group = group(Variation Treatment), label


cd "$data/2_field_experiment/may data"
drop _merge
tostring phoneID, replace
save May2022_MatchedALL.dta, replace

//Create Dataset with multiple lines for multiple calls
	//Issue: Ideally would use CallfireID but since that variable is created as a running count if using an updated dataset it doesn't connect correctly
	//Workaround - trying to link on something uniquely identifying in Callfire data (e.g. fromnumber + answertime)

//First prep the merging dataset for a work-around given can't match on CallfireID 
clear all
use "$data/2_field_experiment/may data/Callfire Spreadsheets/ImportCallfireMay22.dta"
unique fromnumber answertime
bysort fromnumber answertime: gen temp_drop=_N
//brow if temp_drop==2
drop if temp_drop==2&duration==0
save "$data/2_field_experiment/may data/Callfire Spreadsheets/temp1_ImportCallfireMay22.dta", replace
drop temp_drop
bysort link: gen temp_drop=_N
drop if temp_drop>1
replace link = subinstr(link, " ", "", .)
save "$data/2_field_experiment/may data/Callfire Spreadsheets/temp2_ImportCallfireMay22.dta", replace

//Next Open the 1 call dataset we created	
clear all
cd "$data/2_field_experiment/may data"
use May2022_MatchedALL.dta, replace
drop if CallfireID==. //drop principals who didn't get a call matched to them
rename CallfireID CallfireID_2
rename link link_2
keep emailaddress CallfireID_2 fromnumber fromnumber_2 answertime link_2
rename fromnumber fromnumber_3
rename fromnumber_2 fromnumber
duplicates drop
gen restricted=1 if fromnumber=="D0000251251" 
replace fromnumber=fromnumber_3 if fromnumber=="D0000251251" //Minor issue since Restricted Numbers don't have a fromnumber
	//Merge with the Multi phone call data
	merge 1:1 fromnumber answertime using "$data/2_field_experiment/may data/Callfire Spreadsheets/temp1_ImportCallfireMay22.dta"
	savesome if _merge==3 using "$data/2_field_experiment/may data/00_May2022_MultiCalls.dta", replace
	//brow if _merge==1 // ideally there are no _merge==1, but there are 7
	keep if _merge==1
	keep emailaddress CallfireID_2 fromnumber answertime restricted link_2
	rename link_2 link
	replace link = subinstr(link, " ", "", .)
	merge 1:1 link using "$data/2_field_experiment/may data/Callfire Spreadsheets/temp2_ImportCallfireMay22.dta"
	keep if _merge==3
	append using "$data/2_field_experiment/may data/00_May2022_MultiCalls.dta"
	save "$data/2_field_experiment/may data/01_May2022_MultiCalls.dta", replace
	unique phoneID  //1330 lines of data each a single 1st call linked to a single emails
	unique emailaddress //1330
	//phoneID is a unique identfier for each unique phone number (or restricted call). 
	//We want to create a dataset with emailaddress as the unique identifier, but mutliple lines for each call made in response to a single email
	//At present this dataset only links email to the 1st call made
		//key mapping is between email address and phoneID
		rename CallfireID check_CallfireID 
		rename answertime check_answertime 
		rename fromnumber check_fromnumber
	sort phoneID emailaddress 
	keep phoneID emailaddress check_CallfireID check_answertime check_fromnumber
	merge 1:m phoneID using "$data/2_field_experiment/may data/Callfire Spreadsheets/ImportCallfireMay22.dta" 
	//brow if _merge==2 //these are likley mostly school we dropped due to duplicate emails being sent to them or SPAM calls
	count if _merge==1 //should be 0
	keep if _merge==3
	drop _merge
	//Do some checks to see if match looks good
	count if CallfireID==check_CallfireID
	count if answertime==check_answertime
		//brow if answertime!=check_answertime
	count if fromnumber==check_fromnumber	
	drop phone dup_emails num_phone
	//Create Outcome Variables of Interest
	rename CallOrder CallOrder_old
	sort emailaddress numtime
	bysort emailaddress: gen CallOrder=_n
	bysort emailaddress: gen TotalCalls=_N
	
	gen temp1_CallMomFirst=0
		replace temp1_CallMomFirst=1 if CallOrder==1&FemaleNum==1
		bysort emailaddress: egen temp2_CallMomFirst=max(temp1_CallMomFirst)
		rename temp2_CallMomFirst CallMomFirst
		drop temp*
	gen temp1_Call_1Mom_2Dad=0
		replace temp1_Call_1Mom_2Dad=1 if CallMomFirst==1&CallOrder==2&FemaleNum==0
		bysort emailaddress: egen temp2_Call_1Mom_2Dad=max(temp1_Call_1Mom_2Dad)
		rename temp2_Call_1Mom_2Dad Call_1Mom_2Dad
		drop temp*
	gen temp1_Call_1Mom_2Mom=0
		replace temp1_Call_1Mom_2Mom=1 if CallMomFirst==1&CallOrder==2&FemaleNum==1
		bysort emailaddress: egen temp2_Call_1Mom_2Mom=max(temp1_Call_1Mom_2Mom)
		rename temp2_Call_1Mom_2Mom Call_1Mom_2Mom
		drop temp*
	gen temp1_Call_1Dad_2Mom=0
		replace temp1_Call_1Dad_2Mom=1 if CallMomFirst==0&CallOrder==2&FemaleNum==1
		bysort emailaddress: egen temp2_Call_1Dad_2Mom=max(temp1_Call_1Dad_2Mom)
		rename temp2_Call_1Dad_2Mom Call_1Dad_2Mom
		drop temp*
	gen temp1_Call_1Dad_2Dad=0
		replace temp1_Call_1Dad_2Dad=1 if CallMomFirst==0&CallOrder==2&FemaleNum==0
		bysort emailaddress: egen temp2_Call_1Dad_2Dad=max(temp1_Call_1Dad_2Dad)
		rename temp2_Call_1Dad_2Dad Call_1Dad_2Dad
		drop temp*	
	gen OneCallToMom=0
		replace OneCallToMom=1 if TotalCalls==1&FemaleNum==1
	gen OneCallToDad=0
		replace OneCallToDad=1 if TotalCalls==1&FemaleNum==0
	tostring num_fromnumber, replace force format(%20.0f)

	tostring phoneID, replace
	merge m:1 emailaddress using "$data/2_field_experiment/may data/May2022_MatchedALL.dta"
	drop _merge
	unique emailaddress
	save "$data/2_field_experiment/may data/May2022_Matched_1_line_per_call.dta", replace 
	

//Datasets
//1. Single Line For Each School
// use "$data/2_field_experiment/MergedDataSets/2022_05_May2022/May2022_MatchedALL.dta"
//2. Multiple Lines For Each School-Call
//use "$data/2_field_experiment/MergedDataSets/2022_05_May2022/May2022_Matched_1_line_per_call.dta"


//-----------Start here 12/8/2022
//Want to add a line about how the match was done
//Download this spreadsheet
	clear all
	cd "$data/2_field_experiment/may data"	
	//A. 10 digit matches: 854 10 digit phone-call:school matches
	import excel "ForRAsToMatch_Step2 (May 2022).xlsx", sheet("Matched 10 digit") firstrow
	drop if emailaddress==""
	unique emailaddress
	unique CallfireID
	gen email=subinstr(emailaddress, " ", "", .)
	gen lemail=lower(emailaddress)
	keep CallfireID lemail numtime tonumber
	gen MatchType="10 digit"
	save 00_May2022_MatchedFrom10_crosswalk.dta, replace //saves CallfireID lemail
	clear
	//B. Restricted Numbers
	clear
	import excel "ForRAsToMatch_Step2 (May 2022).xlsx", sheet("Callfire Matches") firstrow	
	keep if fromnumber=="D0000251251"
	drop if MatchedEmail=="*"
	unique MatchedEmail
	unique CallfireID
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime	tonumber
	gen MatchType="Restricted"
	save 01_May2022_MatchedRestricted_crosswalk.dta, replace	
	//C. Fuzzy Match (aka 6 digit match), started with XXX unique phone calls, matched 276 to a school but sometimes same school has 2+ calls
	clear
	import excel "ForRAsToMatch_Step2 (May 2022).xlsx", sheet("Callfire Matches") firstrow
	drop if fromnumber=="D0000251251"
	keep if schoolname!="" //these were the "fuzzy match" ones
	keep if CorrectMatch!=0
	unique MatchedEmail
	unique CallfireID
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime	tonumber
	gen MatchType="Fuzzy"
	save 02_May2022_MatchedFuzzy_crosswalk.dta, replace
	//D. NoMatch, started with xxx unique phone calls, matched xxx to a school but sometimes same school has 2+ calls, so xxx unique schools
	clear
	import excel "ForRAsToMatch_Step2 (May 2022).xlsx", sheet("Callfire Matches") firstrow
	drop if fromnumber=="D0000251251"
	drop if schoolname!=""
	keep if CorrectMatch!=0
	unique MatchedEmail
	unique CallfireID
	gen email=subinstr(MatchedEmail, " ", "", .)
	gen lemail=lower(email)
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail
	unique CallfireID
	keep CallfireID lemail numtime	tonumber
	gen MatchType="No Match"
	save 03_May2022_MatchedNoMatch_crosswalk.dta, replace
	//STACK crosswalks
	clear all
	cd "$data/2_field_experiment/may data"	
	use  00_May2022_MatchedFrom10_crosswalk.dta
	append using 01_May2022_MatchedRestricted_crosswalk.dta
	append using 02_May2022_MatchedFuzzy_crosswalk.dta
	append using 03_May2022_MatchedNoMatch_crosswalk.dta
	drop if lemail==""
	drop if lemail=="*"
	drop if lemail=="**"	
	unique lemail //
	unique CallfireID //
	sort lemail numtime
	bysort lemail: gen Keep1st=_n
	keep if Keep1st==1
	unique lemail //1331
	unique CallfireID //1331
	rename numtime numtime_crosswalk
	rename CallfireID CallfireID_crosswalk
	drop Keep1st
	drop if lemail=="na"
	save May2022_All_crosswalk.dta, replace
//Link to already matched dataset
	clear
	cd "$data/2_field_experiment/may data"	
	use May2022_MatchedALL.dta
	gen email=subinstr(emailaddress, " ", "", .)
	gen lemail=lower(emailaddress)
	merge 1:1 lemail using "$data/2_field_experiment/may data/May2022_All_crosswalk.dta"
	rename _merge crosswalk_merge	
	save May2022_MatchedALL.dta, replace
